package WebPAC::Output::KinoSearch;

use warnings;
use strict;

use base qw/WebPAC::Common/;

use KinoSearch::InvIndexer;
use KinoSearch::Analysis::PolyAnalyzer;
use Encode qw/from_to/;
use Data::Dumper;
use Storable;

=head1 NAME

WebPAC::Output::KinoSearch - Create KinoSearch full text index

=head1 VERSION

Version 0.03

=cut

our $VERSION = '0.03';

=head1 SYNOPSIS

Create full text index using KinoSearch index from data with
type C<search>.

=head1 FUNCTIONS

=head2 new

Open KinoSearch index

 my $est = new WebPAC::Output::KinoSearch(
 	index_path => '/path/to/invindex',
	fields => qw/name of all filelds used/,
	database => 'demo',
	label => 'node label',
	encoding => 'iso-8859-2',
	clean => 1,
 );

Options are:

=over 4

=item index_path

path to KinoSearch index to use

=item fields

name of all fields used in this index

=item database

name of database from which data comes

=item label

label for node (optional)

=item encoding

character encoding of C<data_structure> if it's differenet than C<ISO-8859-2>
(and it probably is). This encoding will be converted to C<UTF-8> for
index.

=back

=cut

sub new {
	my $class = shift;
	my $self = {@_};
	bless($self, $class);

	my $log = $self->_get_logger;

	#$log->debug("self: ", sub { Dumper($self) });

	foreach my $p (qw/index_path fields database/) {
		$log->logdie("need $p") unless ($self->{$p});
	}

	$log->logdie("fields is not ARRAY") unless (ref($self->{fields}) eq 'ARRAY');

	$self->{encoding} ||= 'ISO-8859-2';

	$self->{index_path} .= '/' . $self->{database};

	$self->{clean} = 1 if (! -e $self->{index_path} . '/segments');

	$log->info("using", $self->{clean} ? ' new' : '', " index $self->{index_path} with encoding $self->{encoding}");

	my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );

	$self->{invindex} = KinoSearch::InvIndexer->new(
		invindex => $self->{index_path},
		create   => $self->{clean},
		analyzer => $analyzer,
	);

	my $fields_path = $self->{index_path} . '/fields.storable';
	$fields_path =~ s#//#/#g;
	if (-e $fields_path) {
		$self->{fields} = retrieve($fields_path) ||
			$log->warn("can't open $fields_path: $!");
	} else {
		$log->error("This will be dummy run since no fields statistics are found!");
		$log->error("You will have to re-run indexing to get search results!");
		$self->{dummy_run} = 1;
	}
	$self->{fields_path} = $fields_path;

	foreach my $f (@{ $self->{fields} }) {
		$self->{invindex}->spec_field( 
			name  => $f,
#			boost => 10,
			stored => 1,
			indexed => 1,
			vectorized => 0,
		);
	}

	$self ? return $self : return undef;
}


=head2 add

Adds one entry to database.

  $est->add(
  	id => 42,
	ds => $ds,
	type => 'display',
	text => 'optional text from which snippet is created',
  );

This function will create  entries in index using following URI format:

  C<file:///type/database%20name/000>

Each tag in C<data_structure> with specified C<type> will create one
attribute and corresponding hidden text (used for search).

=cut

sub add {
	my $self = shift;

	my $args = {@_};

	my $log = $self->_get_logger;

	my $database = $self->{'database'} || $log->logconfess('no database in $self');
	$log->logconfess('need invindex in object') unless ($self->{'invindex'});

	foreach my $p (qw/id ds type/) {
		$log->logdie("need $p") unless ($args->{$p});
	}

	my $type = $args->{'type'};
	my $id = $args->{'id'};

	my $uri = "file:///$type/$database/$id";
	$log->debug("creating $uri");

	my $doc = $self->{invindex}->new_doc( $uri ) || $log->logdie("can't create new_doc( $uri )");

	sub _add_value($$$$$) {
		my ($self,$log,$doc,$n,$v) = @_;
		return unless ($v);

		$self->{value_usage}->{$n}++;
		return if ($self->{dummy_run});

		eval { $doc->set_value($n, $self->convert($v) ) };
		$log->warn("can't insert: $n = $v") if ($@);
	}

	_add_value($self,$log,$doc, 'uri', $uri);

	$log->debug("ds = ", sub { Dumper($args->{'ds'}) } );

	# filter all tags which have type defined
	my @tags = grep {
		ref($args->{'ds'}->{$_}) eq 'HASH' && defined( $args->{'ds'}->{$_}->{$type} )
	} keys %{ $args->{'ds'} };

	$log->debug("tags = ", join(",", @tags));

	return unless (@tags);

	foreach my $tag (@tags) {

		my $vals = join(" ", @{ $args->{'ds'}->{$tag}->{$type} });

		next if (! $vals);

		$vals = $self->convert( $vals ) or
			$log->logdie("can't convert '$vals' to UTF-8");

		_add_value($self, $log, $doc, $tag, $vals );
	}

	if (my $text = $args->{'text'}) {
		_add_value($self, $log, $doc, 'bodytext', $text );
	}

	#$log->debug("adding ", sub { $doc->dump_draft } );
	$self->{invindex}->add_doc($doc) || $log->warn("can't add document $uri");

	return 1;
}

=head2 finish

Close index

 $index->finish;

=cut

sub finish {
	my $self = shift;

	my $log = $self->_get_logger();

	$log->info("finish index writing to disk");
	$self->{invindex}->finish;

	$log->info("writing value usage file");

	# add fields from last run
	map { $self->{value_usage}->{$_}++ } @{ $self->{fields} };

	my @fields = keys %{ $self->{value_usage} };
	store \@fields, $self->{fields_path} ||
		$log->warn("can't write $self->{fields_path}: $!");

}

=head2 convert

 my $utf8_string = $self->convert('string in codepage');

=cut

sub convert {
	my $self = shift;

	my $text = shift || return;
	from_to($text, $self->{encoding}, 'UTF-8');
	return $text;
}

=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Output::Estraier
